#include<iostream>
#include<arm_neon.h>
using namespace std;
// Scratch buffer for the inline-asm vector add in main(): elements 0-3 and
// 4-7 hold the two four-lane addends, and main() stores the sum to elements
// 8-11. Elements without an explicit initializer are zero-initialized.
float32_t nums[16] = {
    1.0f, 2.0f, 3.0f, 4.0f,
    1.0f, 2.0f, 3.0f, 4.0f,
};
/**
 * Adds two four-lane float vectors held in `nums` using AArch64 NEON inline
 * assembly, and prints the resulting vector once per loop iteration.
 *
 * Layout used by the asm block:
 *   nums[0..3]  - first addend  (loaded into q0)
 *   nums[4..7]  - second addend (loaded into q1)
 *   nums[8..11] - destination of the lane-wise sum
 *
 * The inputs are never modified, so every iteration produces the same sum.
 */
int main() {
    for (int i = 0; i < 3; ++i) {
        __asm__ __volatile__(
            "ldr    q0, [%[lhs]]\n"          // q0 = 128 bits at lhs (4 floats)
            "ldr    q1, [%[rhs]]\n"          // q1 = 128 bits at rhs (4 floats)
            "fadd   v0.4s, v1.4s, v0.4s\n"   // lane-wise single-precision add
            "str    q0, [%[dst]]\n"          // write the 4 sums to dst
            :
            : [lhs] "r"(&nums[0]), [rhs] "r"(&nums[4]), [dst] "r"(&nums[8])
            // Only addresses are passed as operands, so the "memory" clobber is
            // what tells the compiler that the asm reads and writes `nums`.
            // v0/v1 are the vector registers overwritten via their q0/q1 views.
            : "memory", "v0", "v1"
        );
        // Print the sum written by the asm (nums[8..11]). The original printed
        // nums[4..7], an input the asm never changes, so the computed result
        // was never observed.
        std::cout << nums[8] << " " << nums[9] << " " << nums[10] << " "
                  << nums[11] << '\n';
    }
    return 0;
}